{
 "metadata": {
  "name": "",
  "signature": "sha256:dbba45689f23842648309c4a39cd1ee227f5a770ba34b5efbd7bec23856ef28d"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import os\n",
      "\n",
      "# List the working directory; the later cells read from the 'alexa' folder here.\n",
      "os.listdir(os.curdir)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 1,
       "text": [
        "['.ipynb_checkpoints', 'alexa', 'read_from_json.ipynb', 'scrapy.cfg']"
       ]
      }
     ],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import json\n",
      "\n",
      "# Read alexa/cn.json into alexa_items: one parsed JSON value per non-blank line.\n",
      "fname = 'alexa/cn.json'\n",
      "alexa_items = []\n",
      "with open(fname) as f:\n",
      "    for line in f:\n",
      "        line = line.strip()\n",
      "        if not line:\n",
      "            # json.loads raises ValueError on empty input, so skip blank lines.\n",
      "            continue\n",
      "        alexa_items.append(json.loads(line))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Total number of records parsed from the input file.\n",
      "len(alexa_items)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 3,
       "text": [
        "97222"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Peek at the first record to see which fields an item carries.\n",
      "print(alexa_items[0])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "{u'url': u'/siteinfo/taobao.com', u'category': u'World/Chinese_Simplified_CN/\\u8d2d\\u7269', u'name': u'Taobao.com', u'description': u\"Launched in May 2003, Taobao Marketplace (www.taobao.com) is the online shopping destination of choice for Chinese consumers looking for wide selection, value and convenience. Shoppers choose from a wide range of products and services on Taobao Marketplace, which features hundreds of millions of product and service listings. Taobao Marketplace was China's largest online shopping destination in terms of gross merchandise volume in 2013, according to iResearch. Taobao Marketplace is a business within Alibaba Group.\"}\n"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Bucket the records by their 'category' field: category -> list of records.\n",
      "category_items = {}\n",
      "for record in alexa_items:\n",
      "    # setdefault creates the empty bucket the first time a category is seen.\n",
      "    category_items.setdefault(record['category'], []).append(record)\n",
      "print(len(category_items))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "4321\n"
       ]
      }
     ],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import operator\n",
      "\n",
      "# Number of records in each category.\n",
      "counts = {}\n",
      "for category, items in category_items.items():\n",
      "    counts[category] = len(items)\n",
      "\n",
      "# (category, count) pairs, largest count first.\n",
      "sorted_counts = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)\n",
      "\n",
      "# The loop variable is `count` (singular) so the `counts` dict is not rebound to an int.\n",
      "for category, count in sorted_counts[:10]:\n",
      "    # %-formatting prints the same text under Python 2 and Python 3.\n",
      "    print('%s %d' % (category, count))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "World/Chinese_Simplified_CN/\u6e38\u620f 525\n",
        "\u4e9a\u6d32/\u4e2d\u56fd/\u6d59\u6c5f 525\n",
        "\u53c2\u8003/\u6559\u80b2/\u5927\u4e13\u9662\u6821\u4e0e\u7814\u7a76\u6240 525\n",
        "\u4e9a\u6d32/\u4e2d\u56fd/\u56db\u5ddd 525\n",
        "\u4e9a\u6d32/\u4e2d\u56fd/\u6c5f\u82cf 525\n",
        "Chinese_Simplified_CN/\u53c2\u8003/\u6559\u80b2 525\n",
        "\u4e2d\u56fd/\u6d59\u6c5f/\u5b81\u6ce2 525\n",
        "World/Chinese_Simplified_CN/\u5546\u4e1a 525\n",
        "\u4e9a\u6d32/\u4e2d\u56fd/\u4e0a\u6d77 525\n",
        "\u4e9a\u6d32/\u4e2d\u56fd/\u5317\u4eac 525\n"
       ]
      }
     ],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Same top-10 listing, this time sorting the category keys directly by group size.\n",
      "top_categories = sorted(category_items, key=lambda c: len(category_items[c]), reverse=True)[:10]\n",
      "for k in top_categories:\n",
      "    # %-formatting prints the same text under Python 2 and Python 3.\n",
      "    print('%s %d' % (k, len(category_items[k])))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "World/Chinese_Simplified_CN/\u6e38\u620f 525\n",
        "\u4e9a\u6d32/\u4e2d\u56fd/\u6d59\u6c5f 525\n",
        "Chinese_Simplified_CN/\u8ba1\u7b97\u673a/\u4e92\u8054\u7f51\u7edc 525\n",
        "\u4e9a\u6d32/\u4e2d\u56fd/\u56db\u5ddd 525\n",
        "\u4e9a\u6d32/\u4e2d\u56fd/\u6c5f\u82cf 525\n",
        "Chinese_Simplified_CN/\u53c2\u8003/\u6559\u80b2 525\n",
        "\u4e2d\u56fd/\u6d59\u6c5f/\u5b81\u6ce2 525\n",
        "World/Chinese_Simplified_CN/\u5546\u4e1a 525\n",
        "\u4e9a\u6d32/\u4e2d\u56fd/\u4e0a\u6d77 525\n",
        "\u4e9a\u6d32/\u4e2d\u56fd/\u5317\u4eac 525\n"
       ]
      }
     ],
     "prompt_number": 10
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}